#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

'''This utility cleans up the html files as emitted by doxygen so
that they are suitable for publication on a Google documentation site.
'''

import argparse
import glob
import os
import re
import shutil
import sys

try:
  from BeautifulSoup import BeautifulSoup, Tag
except (ImportError, NotImplementedError):
  print ("This tool requires the BeautifulSoup package "
         "(see http://www.crummy.com/software/BeautifulSoup/).\n"
         "Make sure that the file BeautifulSoup.py is either in this directory "
         "or is available in your PYTHON_PATH")
  raise


def Trace(msg):
  '''Writes str(msg) plus a trailing newline to stderr when tracing is on.

  Tracing is controlled by the function attribute Trace.verbose; while it is
  false this call does nothing.
  '''
  if not Trace.verbose:
    return
  line = str(msg) + '\n'
  sys.stderr.write(line)

# Tracing is off until a caller sets Trace.verbose to True.
Trace.verbose = False


# Glob patterns for generated files that should not be published.  main()
# joins each pattern onto the target directory, expands it with glob.glob()
# and deletes every match; a pattern that matches nothing is simply a no-op.
FILES_TO_REMOVE = [
  '*.css',
  '*.map',
  '*.md5',
  'annotated.html',
  'bc_s.png',
  'classes.html',
  'closed.png',
  'doxygen.png',
  'files.html',
  'functions*.html',
  'globals_0x*.html',
  'globals_enum.html',
  'globals_eval.html',
  'globals_func.html',
  'globals.html',
  'globals_type.html',
  'globals_vars.html',
  'graph_legend.html',
  'graph_legend.png',
  'hierarchy.html',
  'index_8dox.html',
  'index.html',
  'modules.html',
  'namespacemembers_func.html',
  'namespacemembers.html',
  'namespaces.html',
  'nav_f.png',
  'nav_h.png',
  'open.png',
  'tab_a.png',
  'tab_b.png',
  'tab_h.png',
  'tab_s.png',
]


class HTMLFixer(object):
  '''Cleans up the html of a single page as produced by Doxygen.

  The markup is parsed once in the constructor.  FixTableHeadings() and
  RemoveTopHeadings() rewrite the parsed tree in place; FixAll() runs both,
  serializes the tree and strips version suffixes from the resulting text.
  '''

  def __init__(self, html):
    '''Parses |html|, the markup of one page, into self.soup.'''
    self.soup = BeautifulSoup(html)

  def FixTableHeadings(self):
    '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Splitting up tables into multiple separate tables if a table
        heading appears in the middle of a table.

    A row counts as a heading only when its first cell contains an <h2>
    whose first <a> carries a non-empty "name" attribute; every other row
    is left untouched.

    For example, this html:
     <table>
      <tr><td colspan="2"><h2><a name="pub-attribs"></a>
      Data Fields List</h2></td></tr>
      ...
     </table>

    would be converted to this:
     <h2>Data Fields List</h2>
     <table>
      ...
     </table>
    '''

    table_headers = []
    for tag in self.soup.findAll('tr'):
      anchor = tag.td and tag.td.h2 and tag.td.h2.a
      # Look the attribute up with get(): indexing an anchor that has no
      # "name" attribute raises KeyError, which would abort the whole page
      # instead of just skipping a row that is not a heading.
      if anchor is not None and anchor.get('name'):
        # The heading text is the node that follows the (empty) anchor.
        # Replacing the row's content with it and renaming the row turns
        # the <tr> itself into the <h2>.
        tag.string = anchor.next
        tag.name = 'h2'
        table_headers.append(tag)

    # reverse the list so that earlier tags don't delete later tags
    table_headers.reverse()
    # Split up tables that have multiple table header (th) rows
    for tag in table_headers:
      Trace("Header tag: %s is %s" % (tag.name, tag.string.strip()))
      # Is this a heading in the middle of a table?
      if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
        Trace("Splitting Table named %s" % tag.string.strip())
        table = tag.parent
        table_parent = table.parent
        table_index = table_parent.contents.index(table)
        new_table = Tag(self.soup, name='table', attrs=table.attrs)
        table_parent.insert(table_index + 1, new_table)
        tag_index = table.contents.index(tag)
        # Iterate over a slice (a copy): inserting a row into new_table
        # removes it from table.contents while we loop.
        for index, row in enumerate(table.contents[tag_index:]):
          new_table.insert(index, row)
      # Now move the <h2> tag to be in front of the <table> tag
      assert tag.parent.name == 'table'
      table = tag.parent
      table_parent = table.parent
      table_index = table_parent.contents.index(table)
      table_parent.insert(table_index, tag)

  def RemoveTopHeadings(self):
    '''Removes <div> sections with a header, tabs, or navpath class attribute'''
    header_tags = self.soup.findAll(
        name='div',
        attrs={'class' : re.compile('^(header|tabs[0-9]*|navpath)$')})
    # A plain loop: extract() is called for its side effect on the tree, so
    # there is no point in collecting its return values in a list.
    for tag in header_tags:
      tag.extract()

  def RemoveVersionNumbers(self, html):
    '''Horrible hack to strip _#_# from struct names.

    Removes every "_<digit>_<digit>" that is immediately followed by one of
    the characters '"', ':', ' ' or '<'; the following character is kept.

    Args:
      html: the page markup as a string.

    Returns:
      The markup with those version suffixes removed.
    '''
    return re.sub(r'(_\d_\d)(?=[": <])', '', html)

  def FixAll(self):
    '''Applies every fix and returns the cleaned-up page as a string.'''
    self.FixTableHeadings()
    self.RemoveTopHeadings()
    html = str(self.soup)
    html = self.RemoveVersionNumbers(html)
    return html


def main(args):
  """Main entry for the doxy_cleanup utility

  doxy_cleanup cleans up the html files generated by doxygen.  Given the
  target directory it:
    1. moves everything found in <directory>/html up into <directory> and
       removes the then-empty 'html' directory,
    2. deletes every file matching a pattern in FILES_TO_REMOVE,
    3. rewrites each remaining '.html' file below <directory> with the
       output of HTMLFixer.FixAll().

  Args:
    args: command-line arguments, without the program name.

  Returns:
    0 when all steps completed.  Errors are not swallowed: an exception
    raised while fixing a file is re-raised after the file name has been
    written to stderr.
  """

  arg_parser = argparse.ArgumentParser(description=__doc__)
  arg_parser.add_argument('-v', '--verbose', help='verbose output.',
                          action='store_true')
  arg_parser.add_argument('directory')
  parsed = arg_parser.parse_args(args)

  if parsed.verbose:
    Trace.verbose = True

  target_dir = parsed.directory
  doxygen_dir = os.path.join(target_dir, 'html')

  # Doxygen writes its output into an 'html' sub-directory; flatten it by
  # moving every entry one level up.
  for entry in glob.glob(os.path.join(doxygen_dir, '*')):
    Trace('Moving %s -> %s' % (entry, target_dir))
    shutil.move(entry, target_dir)

  # The sub-directory is no longer needed.
  Trace('Removing %s' % doxygen_dir)
  os.rmdir(doxygen_dir)

  # Drop the generated files we do not want to publish.
  for pattern in FILES_TO_REMOVE:
    Trace('Removing "%s":' % pattern)
    for unwanted in glob.glob(os.path.join(target_dir, pattern)):
      Trace('  Removing "%s"' % unwanted)
      os.remove(unwanted)

  # Finally clean up, in place, every HTML file that is left.
  Trace('Fixing HTML files...')
  for dirpath, _, names in os.walk(target_dir):
    for name in names:
      extension = os.path.splitext(name)[1]
      if extension != '.html':
        Trace('Skipping %s' % name)
        continue

      html_path = os.path.join(dirpath, name)
      Trace('Processing "%s"...' % html_path)
      try:
        with open(html_path) as infile:
          markup = infile.read()

        cleaned = HTMLFixer(markup).FixAll()
        with open(html_path, 'w') as outfile:
          outfile.write(cleaned)
      except:
        # Name the offending file, then let the original error propagate.
        sys.stderr.write("Error while processing %s\n" % html_path)
        raise

  return 0

if __name__ == '__main__':
  # Exit status is 1 unless main() runs to completion and supplies its own;
  # Ctrl-C is reported on stderr instead of producing a traceback.
  exit_code = 1
  try:
    exit_code = main(sys.argv[1:])
  except KeyboardInterrupt:
    sys.stderr.write('%s: interrupted\n' % os.path.basename(__file__))
  sys.exit(exit_code)
